Compare splice junctions against Baruzzo ground truth in human T1, T2, T3¶
In [1]:
import os
import subprocess
from concurrent.futures import ThreadPoolExecutor
sample_ids = [
"baruzzo_human_t1r1", "baruzzo_human_t2r1", "baruzzo_human_t3r1",
"baruzzo_malaria_t1r1", "baruzzo_malaria_t2r1", "baruzzo_malaria_t3r1"
]
def filter_bam(input_bam, output_bam, mapq_threshold=10):
"""Filters BAM file to retain only reads with MAPQ >= specified threshold."""
command = ["samtools", "view", "-b", "-q", str(mapq_threshold), "-o", output_bam, input_bam]
# print(f"Filtering: {' '.join(command)}")
subprocess.run(command, check=True)
def parallel_filtering(aligner_list, sample_id, mapq_threshold=10, num_threads=4):
"""Filters BAM files in parallel using ThreadPoolExecutor."""
filtered_bam_dir = "/data/filtered_bams"
os.makedirs(filtered_bam_dir, exist_ok=True)
filtered_aligner_list = []
with ThreadPoolExecutor(max_workers=num_threads) as executor:
future_to_bam = {
executor.submit(
filter_bam, bam_file, os.path.join(filtered_bam_dir, f"{aligner}__{sample_id}.MAPQ{mapq_threshold}.bam"), mapq_threshold
): (aligner, bam_file)
for aligner, bam_file in aligner_list
}
for future in future_to_bam:
aligner, bam_file = future_to_bam[future]
try:
future.result() # Wait for the filtering process to complete
filtered_aligner_list.append([
aligner,
os.path.join(filtered_bam_dir, f"{aligner}__{sample_id}.MAPQ{mapq_threshold}.bam")
])
except Exception as e:
print(f"Error filtering {bam_file}: {e}")
return filtered_aligner_list
def run_featurecounts(aligner_list, genome_file, annotation_file, output_file, output_log, extra_flags, threads=8):
"""
Run featureCounts on a list of BAM files.
"""
bam_files = [bam[1] for bam in aligner_list]
command = [
'/home/fberakdar/anaconda3/envs/env_Benchmark_subread/bin/featureCounts',
'-T', str(threads),
'-G', genome_file,
'-a', annotation_file,
'-o', output_file
] + extra_flags + bam_files
print("Running command: {}".format(" ".join(command)))
with open(output_log, "w") as output_log_file:
subprocess.run(command, check=True, stderr=output_log_file, stdout=output_log_file)
for MAPQ in ["10", "0"]:
if MAPQ == "10":
extra_flags = ["-J", "-O", "-Q", MAPQ, "-t", "exon", "-g", "gene_id", "-p", "--primary", "--maxMOp", "16"]
else:
extra_flags = ["-J", "-O", "-Q", MAPQ, "-t", "exon", "-g", "gene_id", "-p", "-M", "--maxMOp", "16"]
for sample_id in sample_ids:
print(f"Processing {sample_id} with MAPQ {MAPQ}")
aligner_list = [
["DeepSAP", f"/data/RNAseqBenchmark/nextflow_May2024/cheesy_cajal/alignment/deepsap/DeepSAP__v0.0.1__GSNAP_24_6__{sample_id}.sorted.bam"],
["DRAGEN", f"/mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_DRAGEN_v4.0.3/scruffy_pesquet/alignment/dragen/DRAGEN__05.121.645.4.0.3__default__{sample_id}.sorted.bam"],
["novoSplice", f"/mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_final_run/ridiculous_plateau/alignment/novoSplice/novoSplice__0.8.4__default__{sample_id}.sorted.bam"],
["STAR", f"/mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_final_run/ridiculous_plateau/alignment/star/STAR__2.7.10a__default__{sample_id}.sorted.bam"],
["HISAT2", f"/mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_final_run/ridiculous_plateau/alignment/hisat2/HISAT2__2.2.1__default__{sample_id}.sorted.bam"],
["Subjunc", f"/mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_final_run/ridiculous_plateau/alignment/subjunc/Subjunc__2.0.1__default__{sample_id}.sorted.bam"],
]
output_file = f"/data/SpliceUp/DeepSAP-main/scripts/plots/featureCounts/counts/sj_counts_{sample_id}_{MAPQ}"
output_log = f"/data/SpliceUp/DeepSAP-main/scripts/plots/featureCounts/counts/sj_counts_{sample_id}_{MAPQ}.log"
n_threads = 24
if "malaria" in sample_id:
genome_file = "/data/references/PFAL_Baruzzo/genome_sequence_pfal.fa"
annotation_file = '/data/references/PFAL_Baruzzo/simulator_config_geneinfo_pfal_GTF'
else:
genome_file = "/data/references/GRCh37_Baruzzo/GRCh37.primary_assembly.genome.fa"
annotation_file = '/data/references/GRCh37_Baruzzo/simulator_config_geneinfo_hg19_GTF'
# Filter BAMs for MAPQ 10 before running featureCounts
if MAPQ == "10":
aligner_list = parallel_filtering(aligner_list, sample_id, mapq_threshold=10, num_threads=8) # Adjust `num_threads` as needed
run_featurecounts(aligner_list, genome_file, annotation_file, output_file, output_log, extra_flags, n_threads)
Processing baruzzo_human_t1r1 with MAPQ 10 Running command: /home/fberakdar/anaconda3/envs/env_Benchmark_subread/bin/featureCounts -T 24 -G /data/references/GRCh37_Baruzzo/GRCh37.primary_assembly.genome.fa -a /data/references/GRCh37_Baruzzo/simulator_config_geneinfo_hg19_GTF -o /data/SpliceUp/DeepSAP-main/scripts/plots/featureCounts/counts/sj_counts_baruzzo_human_t1r1_10 -J -O -Q 10 -t exon -g gene_id -p --primary --maxMOp 16 /data/filtered_bams/DeepSAP__baruzzo_human_t1r1.MAPQ10.bam /data/filtered_bams/DRAGEN__baruzzo_human_t1r1.MAPQ10.bam /data/filtered_bams/novoSplice__baruzzo_human_t1r1.MAPQ10.bam /data/filtered_bams/STAR__baruzzo_human_t1r1.MAPQ10.bam /data/filtered_bams/HISAT2__baruzzo_human_t1r1.MAPQ10.bam /data/filtered_bams/Subjunc__baruzzo_human_t1r1.MAPQ10.bam Processing baruzzo_human_t2r1 with MAPQ 10 Running command: /home/fberakdar/anaconda3/envs/env_Benchmark_subread/bin/featureCounts -T 24 -G /data/references/GRCh37_Baruzzo/GRCh37.primary_assembly.genome.fa -a /data/references/GRCh37_Baruzzo/simulator_config_geneinfo_hg19_GTF -o /data/SpliceUp/DeepSAP-main/scripts/plots/featureCounts/counts/sj_counts_baruzzo_human_t2r1_10 -J -O -Q 10 -t exon -g gene_id -p --primary --maxMOp 16 /data/filtered_bams/DeepSAP__baruzzo_human_t2r1.MAPQ10.bam /data/filtered_bams/DRAGEN__baruzzo_human_t2r1.MAPQ10.bam /data/filtered_bams/novoSplice__baruzzo_human_t2r1.MAPQ10.bam /data/filtered_bams/STAR__baruzzo_human_t2r1.MAPQ10.bam /data/filtered_bams/HISAT2__baruzzo_human_t2r1.MAPQ10.bam /data/filtered_bams/Subjunc__baruzzo_human_t2r1.MAPQ10.bam Processing baruzzo_human_t3r1 with MAPQ 10 Running command: /home/fberakdar/anaconda3/envs/env_Benchmark_subread/bin/featureCounts -T 24 -G /data/references/GRCh37_Baruzzo/GRCh37.primary_assembly.genome.fa -a /data/references/GRCh37_Baruzzo/simulator_config_geneinfo_hg19_GTF -o /data/SpliceUp/DeepSAP-main/scripts/plots/featureCounts/counts/sj_counts_baruzzo_human_t3r1_10 -J -O -Q 10 -t exon -g gene_id -p --primary --maxMOp 16 /data/filtered_bams/DeepSAP__baruzzo_human_t3r1.MAPQ10.bam /data/filtered_bams/DRAGEN__baruzzo_human_t3r1.MAPQ10.bam /data/filtered_bams/novoSplice__baruzzo_human_t3r1.MAPQ10.bam /data/filtered_bams/STAR__baruzzo_human_t3r1.MAPQ10.bam /data/filtered_bams/HISAT2__baruzzo_human_t3r1.MAPQ10.bam /data/filtered_bams/Subjunc__baruzzo_human_t3r1.MAPQ10.bam Processing baruzzo_malaria_t1r1 with MAPQ 10 Running command: /home/fberakdar/anaconda3/envs/env_Benchmark_subread/bin/featureCounts -T 24 -G /data/references/PFAL_Baruzzo/genome_sequence_pfal.fa -a /data/references/PFAL_Baruzzo/simulator_config_geneinfo_pfal_GTF -o /data/SpliceUp/DeepSAP-main/scripts/plots/featureCounts/counts/sj_counts_baruzzo_malaria_t1r1_10 -J -O -Q 10 -t exon -g gene_id -p --primary --maxMOp 16 /data/filtered_bams/DeepSAP__baruzzo_malaria_t1r1.MAPQ10.bam /data/filtered_bams/DRAGEN__baruzzo_malaria_t1r1.MAPQ10.bam /data/filtered_bams/novoSplice__baruzzo_malaria_t1r1.MAPQ10.bam /data/filtered_bams/STAR__baruzzo_malaria_t1r1.MAPQ10.bam /data/filtered_bams/HISAT2__baruzzo_malaria_t1r1.MAPQ10.bam /data/filtered_bams/Subjunc__baruzzo_malaria_t1r1.MAPQ10.bam Processing baruzzo_malaria_t2r1 with MAPQ 10 Running command: /home/fberakdar/anaconda3/envs/env_Benchmark_subread/bin/featureCounts -T 24 -G /data/references/PFAL_Baruzzo/genome_sequence_pfal.fa -a /data/references/PFAL_Baruzzo/simulator_config_geneinfo_pfal_GTF -o /data/SpliceUp/DeepSAP-main/scripts/plots/featureCounts/counts/sj_counts_baruzzo_malaria_t2r1_10 -J -O -Q 10 -t exon -g gene_id -p --primary --maxMOp 16 /data/filtered_bams/DeepSAP__baruzzo_malaria_t2r1.MAPQ10.bam /data/filtered_bams/DRAGEN__baruzzo_malaria_t2r1.MAPQ10.bam /data/filtered_bams/novoSplice__baruzzo_malaria_t2r1.MAPQ10.bam /data/filtered_bams/STAR__baruzzo_malaria_t2r1.MAPQ10.bam /data/filtered_bams/HISAT2__baruzzo_malaria_t2r1.MAPQ10.bam /data/filtered_bams/Subjunc__baruzzo_malaria_t2r1.MAPQ10.bam Processing baruzzo_malaria_t3r1 with MAPQ 10 Running command: /home/fberakdar/anaconda3/envs/env_Benchmark_subread/bin/featureCounts -T 24 -G /data/references/PFAL_Baruzzo/genome_sequence_pfal.fa -a /data/references/PFAL_Baruzzo/simulator_config_geneinfo_pfal_GTF -o /data/SpliceUp/DeepSAP-main/scripts/plots/featureCounts/counts/sj_counts_baruzzo_malaria_t3r1_10 -J -O -Q 10 -t exon -g gene_id -p --primary --maxMOp 16 /data/filtered_bams/DeepSAP__baruzzo_malaria_t3r1.MAPQ10.bam /data/filtered_bams/DRAGEN__baruzzo_malaria_t3r1.MAPQ10.bam /data/filtered_bams/novoSplice__baruzzo_malaria_t3r1.MAPQ10.bam /data/filtered_bams/STAR__baruzzo_malaria_t3r1.MAPQ10.bam /data/filtered_bams/HISAT2__baruzzo_malaria_t3r1.MAPQ10.bam /data/filtered_bams/Subjunc__baruzzo_malaria_t3r1.MAPQ10.bam Processing baruzzo_human_t1r1 with MAPQ 0 Running command: /home/fberakdar/anaconda3/envs/env_Benchmark_subread/bin/featureCounts -T 24 -G /data/references/GRCh37_Baruzzo/GRCh37.primary_assembly.genome.fa -a /data/references/GRCh37_Baruzzo/simulator_config_geneinfo_hg19_GTF -o /data/SpliceUp/DeepSAP-main/scripts/plots/featureCounts/counts/sj_counts_baruzzo_human_t1r1_0 -J -O -Q 0 -t exon -g gene_id -p -M --maxMOp 16 /data/RNAseqBenchmark/nextflow_May2024/cheesy_cajal/alignment/deepsap/DeepSAP__v0.0.1__GSNAP_24_6__baruzzo_human_t1r1.sorted.bam /mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_DRAGEN_v4.0.3/scruffy_pesquet/alignment/dragen/DRAGEN__05.121.645.4.0.3__default__baruzzo_human_t1r1.sorted.bam /mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_final_run/ridiculous_plateau/alignment/novoSplice/novoSplice__0.8.4__default__baruzzo_human_t1r1.sorted.bam /mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_final_run/ridiculous_plateau/alignment/star/STAR__2.7.10a__default__baruzzo_human_t1r1.sorted.bam /mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_final_run/ridiculous_plateau/alignment/hisat2/HISAT2__2.2.1__default__baruzzo_human_t1r1.sorted.bam /mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_final_run/ridiculous_plateau/alignment/subjunc/Subjunc__2.0.1__default__baruzzo_human_t1r1.sorted.bam Processing baruzzo_human_t2r1 with MAPQ 0 Running command: /home/fberakdar/anaconda3/envs/env_Benchmark_subread/bin/featureCounts -T 24 -G /data/references/GRCh37_Baruzzo/GRCh37.primary_assembly.genome.fa -a /data/references/GRCh37_Baruzzo/simulator_config_geneinfo_hg19_GTF -o /data/SpliceUp/DeepSAP-main/scripts/plots/featureCounts/counts/sj_counts_baruzzo_human_t2r1_0 -J -O -Q 0 -t exon -g gene_id -p -M --maxMOp 16 /data/RNAseqBenchmark/nextflow_May2024/cheesy_cajal/alignment/deepsap/DeepSAP__v0.0.1__GSNAP_24_6__baruzzo_human_t2r1.sorted.bam /mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_DRAGEN_v4.0.3/scruffy_pesquet/alignment/dragen/DRAGEN__05.121.645.4.0.3__default__baruzzo_human_t2r1.sorted.bam /mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_final_run/ridiculous_plateau/alignment/novoSplice/novoSplice__0.8.4__default__baruzzo_human_t2r1.sorted.bam /mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_final_run/ridiculous_plateau/alignment/star/STAR__2.7.10a__default__baruzzo_human_t2r1.sorted.bam /mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_final_run/ridiculous_plateau/alignment/hisat2/HISAT2__2.2.1__default__baruzzo_human_t2r1.sorted.bam /mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_final_run/ridiculous_plateau/alignment/subjunc/Subjunc__2.0.1__default__baruzzo_human_t2r1.sorted.bam Processing baruzzo_human_t3r1 with MAPQ 0 Running command: /home/fberakdar/anaconda3/envs/env_Benchmark_subread/bin/featureCounts -T 24 -G /data/references/GRCh37_Baruzzo/GRCh37.primary_assembly.genome.fa -a /data/references/GRCh37_Baruzzo/simulator_config_geneinfo_hg19_GTF -o /data/SpliceUp/DeepSAP-main/scripts/plots/featureCounts/counts/sj_counts_baruzzo_human_t3r1_0 -J -O -Q 0 -t exon -g gene_id -p -M --maxMOp 16 /data/RNAseqBenchmark/nextflow_May2024/cheesy_cajal/alignment/deepsap/DeepSAP__v0.0.1__GSNAP_24_6__baruzzo_human_t3r1.sorted.bam /mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_DRAGEN_v4.0.3/scruffy_pesquet/alignment/dragen/DRAGEN__05.121.645.4.0.3__default__baruzzo_human_t3r1.sorted.bam /mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_final_run/ridiculous_plateau/alignment/novoSplice/novoSplice__0.8.4__default__baruzzo_human_t3r1.sorted.bam /mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_final_run/ridiculous_plateau/alignment/star/STAR__2.7.10a__default__baruzzo_human_t3r1.sorted.bam /mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_final_run/ridiculous_plateau/alignment/hisat2/HISAT2__2.2.1__default__baruzzo_human_t3r1.sorted.bam /mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_final_run/ridiculous_plateau/alignment/subjunc/Subjunc__2.0.1__default__baruzzo_human_t3r1.sorted.bam Processing baruzzo_malaria_t1r1 with MAPQ 0 Running command: /home/fberakdar/anaconda3/envs/env_Benchmark_subread/bin/featureCounts -T 24 -G /data/references/PFAL_Baruzzo/genome_sequence_pfal.fa -a /data/references/PFAL_Baruzzo/simulator_config_geneinfo_pfal_GTF -o /data/SpliceUp/DeepSAP-main/scripts/plots/featureCounts/counts/sj_counts_baruzzo_malaria_t1r1_0 -J -O -Q 0 -t exon -g gene_id -p -M --maxMOp 16 /data/RNAseqBenchmark/nextflow_May2024/cheesy_cajal/alignment/deepsap/DeepSAP__v0.0.1__GSNAP_24_6__baruzzo_malaria_t1r1.sorted.bam /mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_DRAGEN_v4.0.3/scruffy_pesquet/alignment/dragen/DRAGEN__05.121.645.4.0.3__default__baruzzo_malaria_t1r1.sorted.bam /mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_final_run/ridiculous_plateau/alignment/novoSplice/novoSplice__0.8.4__default__baruzzo_malaria_t1r1.sorted.bam /mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_final_run/ridiculous_plateau/alignment/star/STAR__2.7.10a__default__baruzzo_malaria_t1r1.sorted.bam /mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_final_run/ridiculous_plateau/alignment/hisat2/HISAT2__2.2.1__default__baruzzo_malaria_t1r1.sorted.bam /mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_final_run/ridiculous_plateau/alignment/subjunc/Subjunc__2.0.1__default__baruzzo_malaria_t1r1.sorted.bam Processing baruzzo_malaria_t2r1 with MAPQ 0 Running command: /home/fberakdar/anaconda3/envs/env_Benchmark_subread/bin/featureCounts -T 24 -G /data/references/PFAL_Baruzzo/genome_sequence_pfal.fa -a /data/references/PFAL_Baruzzo/simulator_config_geneinfo_pfal_GTF -o /data/SpliceUp/DeepSAP-main/scripts/plots/featureCounts/counts/sj_counts_baruzzo_malaria_t2r1_0 -J -O -Q 0 -t exon -g gene_id -p -M --maxMOp 16 /data/RNAseqBenchmark/nextflow_May2024/cheesy_cajal/alignment/deepsap/DeepSAP__v0.0.1__GSNAP_24_6__baruzzo_malaria_t2r1.sorted.bam /mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_DRAGEN_v4.0.3/scruffy_pesquet/alignment/dragen/DRAGEN__05.121.645.4.0.3__default__baruzzo_malaria_t2r1.sorted.bam /mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_final_run/ridiculous_plateau/alignment/novoSplice/novoSplice__0.8.4__default__baruzzo_malaria_t2r1.sorted.bam /mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_final_run/ridiculous_plateau/alignment/star/STAR__2.7.10a__default__baruzzo_malaria_t2r1.sorted.bam /mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_final_run/ridiculous_plateau/alignment/hisat2/HISAT2__2.2.1__default__baruzzo_malaria_t2r1.sorted.bam /mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_final_run/ridiculous_plateau/alignment/subjunc/Subjunc__2.0.1__default__baruzzo_malaria_t2r1.sorted.bam Processing baruzzo_malaria_t3r1 with MAPQ 0 Running command: /home/fberakdar/anaconda3/envs/env_Benchmark_subread/bin/featureCounts -T 24 -G /data/references/PFAL_Baruzzo/genome_sequence_pfal.fa -a /data/references/PFAL_Baruzzo/simulator_config_geneinfo_pfal_GTF -o /data/SpliceUp/DeepSAP-main/scripts/plots/featureCounts/counts/sj_counts_baruzzo_malaria_t3r1_0 -J -O -Q 0 -t exon -g gene_id -p -M --maxMOp 16 /data/RNAseqBenchmark/nextflow_May2024/cheesy_cajal/alignment/deepsap/DeepSAP__v0.0.1__GSNAP_24_6__baruzzo_malaria_t3r1.sorted.bam /mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_DRAGEN_v4.0.3/scruffy_pesquet/alignment/dragen/DRAGEN__05.121.645.4.0.3__default__baruzzo_malaria_t3r1.sorted.bam /mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_final_run/ridiculous_plateau/alignment/novoSplice/novoSplice__0.8.4__default__baruzzo_malaria_t3r1.sorted.bam /mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_final_run/ridiculous_plateau/alignment/star/STAR__2.7.10a__default__baruzzo_malaria_t3r1.sorted.bam /mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_final_run/ridiculous_plateau/alignment/hisat2/HISAT2__2.2.1__default__baruzzo_malaria_t3r1.sorted.bam /mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_final_run/ridiculous_plateau/alignment/subjunc/Subjunc__2.0.1__default__baruzzo_malaria_t3r1.sorted.bam
In [1]:
from pdf2image import convert_from_path
from IPython.display import display, Image
from utility_sj import plot_grid_for_aligners_baruzzo, parse_featurecounts_junctions, plot_grid_for_aligners_baruzzo, process_featurecounts_and_novel_junctions, plot_grid_for_aligners, extract_cig_junctions, extract_cig_junctions, process_featurecounts_baruzzo , extract_custom_gtf_junctions
import pandas as pd
from scipy.stats import pearsonr, spearmanr, kendalltau
from sklearn.linear_model import TheilSenRegressor
import numpy as np
aligners = ['DeepSAP', 'DRAGEN', 'novoSplice', 'STAR', 'HISAT2', 'Subjunc']
sample_ids = ["baruzzo_human_t1r1", "baruzzo_human_t2r1", "baruzzo_human_t3r1", "baruzzo_malaria_t1r1", "baruzzo_malaria_t2r1", "baruzzo_malaria_t3r1",]
sample_ids = [ "baruzzo_human_t1r1", "baruzzo_human_t2r1", "baruzzo_human_t3r1"]
for sample_id in sample_ids:
baruzzo_ground_truth = f'/data/RNAseqBenchmark/nextflow_May2024/cheesy_cajal/reads/{sample_id}.cig'
# baruzzo_junctions_df = extract_cig_junctions(baruzzo_ground_truth)
# baruzzo_junctions_df.to_csv(f'/data/SpliceUp/DeepSAP-main/scripts/plots/featureCounts/ground_truth/{sample_id}_junctions_df_.csv', index=False)
baruzzo_junctions_df = pd.read_csv(f'/data/SpliceUp/DeepSAP-main/scripts/plots/featureCounts/ground_truth/{sample_id}_junctions_df_.csv')
for MAPQ in ["0"]:
print(f"\nGenerating plot for {sample_id} and MAPQ {MAPQ}")
featurecounts_file = f"/data/SpliceUp/DeepSAP-main/scripts/plots/featureCounts/counts/sj_counts_{sample_id}_{MAPQ}.jcounts"
counted_junctions_df = process_featurecounts_baruzzo(featurecounts_file)
counted_junctions_df.to_csv(f'plots/featureCounts/processed_counts/sj_counts_{sample_id}_{MAPQ}.csv', index=False)
# counted_junctions_df = pd.read_csv(f'plots/featureCounts/processed_counts/sj_counts_{sample_id}_{MAPQ}.csv')
matching_columns = ['Chromosome', 'Exon1_End', 'Exon2_Start']
merged_df = pd.merge(baruzzo_junctions_df, counted_junctions_df, on=matching_columns, how='outer')
merged_df = merged_df.fillna(0)
# Compute Pearson correlation and p-values
aligners = ['DeepSAP', 'DRAGEN', 'novoSplice', 'STAR', 'HISAT2', 'Subjunc']
correlation_results = []
pdf_path = f'plots/featureCounts/figures/aligner_vs_groundtruth_{sample_id}_MAPQ{MAPQ}.pdf'
png_path = f'plots/featureCounts/figures/aligner_vs_groundtruth_{sample_id}_MAPQ{MAPQ}.png'
# Plottting for final pdf
plot_grid_for_aligners_baruzzo(merged_df, aligners, pdf_path, log_scale=True)
# Plotting just to show plots
images = convert_from_path(pdf_path, dpi=300)
resized_image = images[0].resize((images[0].width // 2, images[0].height // 2))
resized_image.save(png_path, 'PNG')
display(Image(png_path))
Generating plot for baruzzo_human_t1r1 and MAPQ 0
DeepSAP: heil-Sen slope= 0.999998, 1.000 DRAGEN: heil-Sen slope= 0.956426, 0.956 novoSplice: heil-Sen slope= 0.978782, 0.979 STAR: heil-Sen slope= 0.957513, 0.958 HISAT2: heil-Sen slope= 0.999989, 1.000 Subjunc: heil-Sen slope= 0.999940, 1.000
Generating plot for baruzzo_human_t2r1 and MAPQ 0 DeepSAP: heil-Sen slope= 0.999965, 1.000 DRAGEN: heil-Sen slope= 0.945790, 0.946 novoSplice: heil-Sen slope= 0.965219, 0.965 STAR: heil-Sen slope= 0.910179, 0.910 HISAT2: heil-Sen slope= 0.879977, 0.880 Subjunc: heil-Sen slope= 0.948834, 0.949
Generating plot for baruzzo_human_t3r1 and MAPQ 0 DeepSAP: heil-Sen slope= 0.876132, 0.876 DRAGEN: heil-Sen slope= 0.810139, 0.810 novoSplice: heil-Sen slope= 0.770251, 0.770 STAR: heil-Sen slope= 0.313507, 0.314 HISAT2: heil-Sen slope= 0.000351, 0.000 Subjunc: heil-Sen slope= 0.197540, 0.198
Compare splice junctions detection of DRAGEN, novoSplice, STAR, HISAT2 and Subjunc against DeepSAP in human T1, T2, T3¶
In [1]:
from pdf2image import convert_from_path
from IPython.display import display, Image
from utility_sj import parse_featurecounts_junctions, detect_novel_junctions, process_featurecounts_and_novel_junctions, plot_grid_for_aligners, extract_cig_junctions, extract_cig_junctions, process_featurecounts_baruzzo , extract_gtf_junctions, extract_custom_gtf_junctions
from utility_sj import plot_grid_for_aligners_2columns_deepSAP_vs_rest
baruzzo_gtf = "/data/references/GRCh37_Baruzzo/simulator_config_geneinfo_hg19_GTF"
baruzzo_junctions = extract_custom_gtf_junctions(baruzzo_gtf)
orientation = "horizontal"
for MAPQ in ["0"]:
for T in ["1", "2", "3"]:
featurecounts_file = f"/data/SpliceUp/DeepSAP-main/scripts/plots/featureCounts/counts/sj_counts_baruzzo_human_t{T}r1_{MAPQ}.jcounts"
output_pdf = f"/data/SpliceUp/DeepSAP-main/scripts/plots/featureCounts/figures_pair/baruzzo_human_t{T}_MAPQ{MAPQ}_sj_pairwise.pdf"
output_png = f"/data/SpliceUp/DeepSAP-main/scripts/plots/featureCounts/figures_pair/baruzzo_human_t{T}_MAPQ{MAPQ}_sj_pairwise.png"
counted_junctions = parse_featurecounts_junctions(featurecounts_file)
novel_junctions = counted_junctions - baruzzo_junctions
print(f"Processing Baruzzo Human T{T}")
print("GTF junctions:\t", len(baruzzo_junctions))
print("FC junctions:\t", len(counted_junctions))
print("FC novel junctions:\t", len(novel_junctions))
df = process_featurecounts_and_novel_junctions(featurecounts_file, novel_junctions)
aligner_mapping = {
f'/data/RNAseqBenchmark/nextflow_May2024/cheesy_cajal/alignment/deepsap/DeepSAP__v0.0.1__GSNAP_24_6__baruzzo_human_t{T}r1.sorted.bam': 'DeepSAP',
f'/mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_DRAGEN_v4.0.3/scruffy_pesquet/alignment/dragen/DRAGEN__05.121.645.4.0.3__default__baruzzo_human_t{T}r1.sorted.bam': 'DRAGEN',
f'/mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_final_run/ridiculous_plateau/alignment/novoSplice/novoSplice__0.8.4__default__baruzzo_human_t{T}r1.sorted.bam': 'novoSplice',
f'/mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_final_run/ridiculous_plateau/alignment/star/STAR__2.7.10a__default__baruzzo_human_t{T}r1.sorted.bam': 'STAR',
f'/mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_final_run/ridiculous_plateau/alignment/hisat2/HISAT2__2.2.1__default__baruzzo_human_t{T}r1.sorted.bam': 'HISAT2',
f'/mnt/portable_disk/Fadel_rnaseq_data/RNAseqBenchmark/nextflow_final_run/ridiculous_plateau/alignment/subjunc/Subjunc__2.0.1__default__baruzzo_human_t{T}r1.sorted.bam': 'Subjunc'
}
# Rename columns in the DataFrame using the aligner_mapping dictionary
df.rename(columns=aligner_mapping, inplace=True)
plot_grid_for_aligners_2columns_deepSAP_vs_rest(df, "DeepSAP", [ "DRAGEN","novoSplice", "STAR", "HISAT2", "Subjunc"], output_pdf, log_scale=True, add_constant=1, limit=3, orient=orientation)
images = convert_from_path(output_pdf, dpi=300) # save image to png then showing it
resized_image = images[0].resize((images[0].width // 2, images[0].height // 2))
resized_image.save(output_png, 'PNG')
display(Image(output_png))
Processing Baruzzo Human T1 GTF junctions: 191293 FC junctions: 175019 FC novel junctions: 36217
Processing Baruzzo Human T2 GTF junctions: 191293 FC junctions: 195696 FC novel junctions: 56865
Processing Baruzzo Human T3 GTF junctions: 191293 FC junctions: 245470 FC novel junctions: 106359
In [ ]: